Exploratory Data Analysis - Rainfall

1 Import Packages

pacman::p_load(tidyverse, readr, psych, st, stars, tmap, sf,
               ggstatsplot, plotly, ggplot2, ggdist, dplyr, ggiraph)

2 Load the prepared files

Let’s load the RDS files after data preparation.

rainfall <- readRDS("data/rainfall.rds")
describe(rainfall)
                 vars    n    mean     sd median trimmed    mad  min    max
Station*            1 5547   10.51   5.02   12.0   10.69   5.93    1   18.0
Region*             2 5547    3.51   1.39    4.0    3.63   1.48    1    5.0
Year                3 5547 2006.25  12.34 2010.0 2007.10  13.34 1980 2023.0
Month*              4 5547    6.52   3.45    7.0    6.52   4.45    1   12.0
Date                5 5547     NaN     NA     NA     NaN     NA  Inf   -Inf
TotalRainfall       6 5547  205.65 110.97  190.6  197.50 103.93    0  986.5
TotalRainfall30     7 5547   36.09  62.52    0.0   22.29   0.00    0  320.6
TotalRainfall60     8 5547   45.01  78.85    0.0   27.38   0.00    0  424.0
TotalRainfall120    9 5547   51.38  90.45    0.0   31.05   0.00    0  493.2
                 range  skew kurtosis   se
Station*          17.0 -0.24    -1.28 0.07
Region*            4.0 -0.39    -1.25 0.02
Year              43.0 -0.50    -0.99 0.17
Month*            11.0  0.00    -1.22 0.05
Date              -Inf    NA       NA   NA
TotalRainfall    986.5  0.85     1.51 1.49
TotalRainfall30  320.6  1.64     1.71 0.84
TotalRainfall60  424.0  1.71     2.10 1.06
TotalRainfall120 493.2  1.75     2.35 1.21

3 Map of Singapore

mpsz <- st_read(dsn = "data/geospatial", layer = "MPSZ-2019") %>% 
  st_transform(crs=3414)
Reading layer `MPSZ-2019' from data source 
  `C:\Vanessa\SMU\Term 4 - Visual Analytics & Applications\mvheng\Group11_VAP\EDA\data\geospatial' 
  using driver `ESRI Shapefile'
Simple feature collection with 332 features and 6 fields
Geometry type: MULTIPOLYGON
Dimension:     XY
Bounding box:  xmin: 103.6057 ymin: 1.158699 xmax: 104.0885 ymax: 1.470775
Geodetic CRS:  WGS 84
glimpse(mpsz)
Rows: 332
Columns: 7
$ SUBZONE_N  <chr> "MARINA EAST", "INSTITUTION HILL", "ROBERTSON QUAY", "JURON…
$ SUBZONE_C  <chr> "MESZ01", "RVSZ05", "SRSZ01", "WISZ01", "MUSZ02", "MPSZ05",…
$ PLN_AREA_N <chr> "MARINA EAST", "RIVER VALLEY", "SINGAPORE RIVER", "WESTERN …
$ PLN_AREA_C <chr> "ME", "RV", "SR", "WI", "MU", "MP", "WI", "WI", "SI", "SI",…
$ REGION_N   <chr> "CENTRAL REGION", "CENTRAL REGION", "CENTRAL REGION", "WEST…
$ REGION_C   <chr> "CR", "CR", "CR", "WR", "CR", "CR", "WR", "WR", "CR", "CR",…
$ geometry   <MULTIPOLYGON [m]> MULTIPOLYGON (((33222.98 29..., MULTIPOLYGON (…

Let’s take a look at the planning areas for the 5 regions.

tmap_mode("view")

tm_shape(mpsz) +
  tm_polygons(col = "REGION_N", palette = "Set2")+
  tm_layout(main.title = "Planning Area",
            main.title.position = "left",
            main.title.size = 1,
            legend.show = FALSE,
            frame = FALSE) +
  tmap_options(check.and.fix = TRUE) +
  tm_view(set.zoom.limits = c(11,12))

4 Rainfall analysis

4.1 Analyse rainfall using maps

Let’s map the station to the planning area (PA).

Show the code
station_to_PA <- c(
  "Admiralty" = "WOODLANDS",
  "Ang Mo Kio" = "ANG MO KIO",
  "Boon Lay (East)" = "BOON LAY",
  "Changi" = "CHANGI",
  "Choa Chu Kang (South)" = "CHOA CHU KANG",
  "Clementi" = "CLEMENTI",
  "East Coast Parkway" = "BEDOK",
  "Jurong (West)" = "JURONG WEST",
  "Khatib" = "YISHUN",
  "Marina Barrage" = "DOWNTOWN CORE",
  "Newton" = "NEWTON",
  "Pasir Panjang" = "PASIR PANJANG",
  "Paya Lebar" = "PAYA LEBAR",
  "Seletar" = "SELETAR",
  "Sembawang" = "SEMBAWANG",
  "Tai Seng" = "HOUGANG",
  "Tengah" = "TENGAH",
  "Tuas South" = "TUAS"
)

rainfall$PA <- station_to_PA[rainfall$Station]
rainfall <- rainfall[, c("PA", setdiff(names(rainfall), "PA"))]
head(rainfall)
# A tibble: 6 × 10
  PA        Station  Region  Year Month Date       TotalRainfall TotalRainfall30
  <chr>     <chr>    <chr>  <dbl> <ord> <date>             <dbl>           <dbl>
1 WOODLANDS Admiral… North   2009 Jan   2009-01-01           0.8               0
2 WOODLANDS Admiral… North   2009 Feb   2009-02-01         148                 0
3 WOODLANDS Admiral… North   2009 Mar   2009-03-01         348                 0
4 WOODLANDS Admiral… North   2009 Apr   2009-04-01         149.                0
5 WOODLANDS Admiral… North   2009 May   2009-05-01         206.                0
6 WOODLANDS Admiral… North   2009 Jun   2009-06-01          92                 0
# ℹ 2 more variables: TotalRainfall60 <dbl>, TotalRainfall120 <dbl>
rain_map <- rainfall %>% 
  group_by(PA, Station, Year) %>% 
  summarise(Annual_Rainfall = 
              sum(TotalRainfall, na.rm = TRUE)) %>%
  ungroup()

glimpse(rain_map)
Rows: 469
Columns: 4
$ PA              <chr> "ANG MO KIO", "ANG MO KIO", "ANG MO KIO", "ANG MO KIO"…
$ Station         <chr> "Ang Mo Kio", "Ang Mo Kio", "Ang Mo Kio", "Ang Mo Kio"…
$ Year            <dbl> 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, …
$ Annual_Rainfall <dbl> 830.6, 2849.2, 3050.4, 2579.6, 3240.0, 1961.4, 2018.4,…
mpsztemp <- left_join(mpsz, rain_map,
                         by = c("PLN_AREA_N" = "PA"))
glimpse(mpsztemp)
Rows: 3,485
Columns: 10
$ SUBZONE_N       <chr> "MARINA EAST", "INSTITUTION HILL", "ROBERTSON QUAY", "…
$ SUBZONE_C       <chr> "MESZ01", "RVSZ05", "SRSZ01", "WISZ01", "MUSZ02", "MPS…
$ PLN_AREA_N      <chr> "MARINA EAST", "RIVER VALLEY", "SINGAPORE RIVER", "WES…
$ PLN_AREA_C      <chr> "ME", "RV", "SR", "WI", "MU", "MP", "WI", "WI", "SI", …
$ REGION_N        <chr> "CENTRAL REGION", "CENTRAL REGION", "CENTRAL REGION", …
$ REGION_C        <chr> "CR", "CR", "CR", "WR", "CR", "CR", "WR", "WR", "CR", …
$ Station         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "Marina Ba…
$ Year            <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 2010, 2011…
$ Annual_Rainfall <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 1112.0, 23…
$ geometry        <MULTIPOLYGON [m]> MULTIPOLYGON (((33222.98 29..., MULTIPOLY…

Let’s plot the annual mean temperature distribution across Singapore.

tm_shape(mpsztemp) +
  tm_polygons(col = "Annual_Rainfall", 
              palette = "Blues", 
              style = "jenks") +
  tm_view(set.zoom.limits = c(11,12))
Note

It seems like the western area of Singapore has more rainfall.

4.2 Rainfall Time Series

4.2.1 Overall - Rainfall Time Series

Show the code
gg <- ggplot(rainfall, aes(x = Date, y = TotalRainfall, 
                         color = factor(Year))) +
    geom_line(linewidth = 0.1) +
    geom_point(aes(text = paste0("Month:", Month, 
                                "<br>Total Rainfall:", TotalRainfall, "mm"))) +
    labs(x = "Year", y = "Monthly Total Rainfall (mm)", color = "Year",
         title = "Trend of Monthly Total Rainfall from 1981 to 2023", 
         caption = "Data from Meteorological Service Singapore website") +
    geom_smooth(method = "lm", 
                se = FALSE, color = "black") +
    theme_minimal() 

ggplotly(gg, tooltip = "text") %>%
    layout(title = list(text = 
                        paste0(gg$labels$title, "<br>", "<sup>", 
                               gg$labels$subtitle, "</sup>"),
                        font = list(weight = "bold")),
           showlegend = FALSE,
    annotations = list(text = gg$labels$caption,
                      xref = "paper", yref = "paper",
                      x = 1000, y = 24,
                      xanchor = "right", yanchor = "top",
                      showarrow = FALSE)) 
Note

We can observe that the trend of rainfall is constant horizontal line, means the rainfall over the years are similar.

4.2.2 Rainfall Time Series by station

Show the code
rain_station <- rainfall %>%
  group_by(Station, Year) %>%
  summarise(rain = sum(TotalRainfall, na.rm = TRUE)) %>%
  ungroup()

rain_station$mean_tooltip <- c(paste0(
  "Year: ", rain_station$Year,
  "\n Station: ", rain_station$Station,
  "\n Total Rainfall: ", rain_station$rain, "mm"))

line <- ggplot(data = rain_station,
               aes(x = Year,
                   y = rain,
                   group = Station,
                   color = Station,
                   data_id = Station)) +
  geom_line_interactive(size = 1.2,
                        alpha = 0.4) +
  geom_point_interactive(aes(tooltip = rain_station$mean_tooltip),
                         fill = "white",
                         size = 1,
                         stroke = 1,
                         shape = 21) +
  theme_classic() +
  ylab("Annual Total Rainfall(mm)") +
  xlab("Year") +
  ggtitle("Annual Total Rainfall") +
  theme(plot.title = element_text(size = 10),
        plot.subtitle = element_text(size = 8)) 

girafe(ggobj = line, 
       width_svg = 8,
       height_svg = 6 * 0.618,
       options = list(
         opts_hover(css = "stroke-width: 2.5; opacity: 1;"),
         opts_hover_inv(css = "stroke-width: 1;opacity:0.6;")))

4.3 Confidence Interval of Total Rainfall

Show the code
rain_yr_error <- rainfall %>%
  group_by(Year) %>%
  summarise(n = n(), rain = sum(TotalRainfall, na.rm = TRUE), 
            sd = sd(TotalRainfall, na.rm = TRUE)) %>%
  mutate(se = sd/sqrt(n-1)) %>% 
  ungroup()

model <- lm(rain ~ Year, rain_yr_error)
y_intercept = coef(model)[1] 
slope_coeff = coef(model)[2]
adjust_yintercept = slope_coeff * 1982 + y_intercept

gg <- ggplot(rain_yr_error) +
       geom_errorbar(aes(x = factor(Year), ymin = rain - 2.58 * se, 
                      ymax = rain + 2.58*se), 
                      width=0.2, colour="black", 
                      alpha=0.9, size=0.5) +
       geom_point(aes(x = factor(Year), y = rain, 
             text = paste0("Year:", `Year`, 
                          "<br>Total Rainfall:", round(rain, digits = 2),
                          "<br>95% CI:[", 
                          round((rain - 2.58 * se), digits = 2), ",",
                          round((rain + 2.58 * se), digits = 2),"]")),
             stat="identity", color="darkred", 
             size = 1.5, alpha = 1) +
       geom_abline(slope = round(slope_coeff, 4), 
                   intercept = adjust_yintercept,
                   untf = TRUE,
                   color = "blue",
                   linetype = "dashed")+
       geom_text(aes(x = 11, y = 27.8, colour = "blue",
                     label = paste0("Rainfall=", 
                                    round(slope_coeff, 4), "* Year ",
                                    round(y_intercept, 4)))) +
       labs (x = "Year", y = "Annual mean temperatures (°C)",
             title = "99% Confidence interval of annual total rainfall by year",
             subtitle = "From 1982 to 2023",
             caption = "Data from Meteorological Service Singapore website") +
       theme_minimal() + 
       theme(axis.text.x = element_text(angle = 45, vjust = 0.5, hjust=1),
             plot.title = element_text(face = "bold", size = 12))

ggplotly(gg, tooltip = "text") %>%
    layout(title = list(text = 
                        paste0(gg$labels$title, "<br>", "<sup>", 
                               gg$labels$subtitle, "</sup>"),
                        font = list(weight = "bold")),
           showlegend = FALSE)
Note

We can observe that the total rainfall over the years have increased.

4.4 Rainfall across the months

4.4.1 Box plot across the months

Show the code
gg <- ggplot(rainfall, 
       aes(x = factor(Month, levels = month.abb), y = TotalRainfall)) +
  geom_violin(color = "navy", fill = "lightblue") +
  geom_hline(data = rainfall, 
             aes(yintercept = mean(TotalRainfall, na.rm = TRUE)),
             linetype = "dashed", size = 1, colour = "brown") +
  geom_text(aes(x = 4.5, y = 27.3, 
                 label = paste0("Total Rainfall : ", 
                                round(sum(TotalRainfall,na.rm = TRUE),2), "mm")), 
            colour = "brown") +
  stat_summary(fun = mean, geom = "point", 
               shape = 20, size = 3, color = "orange",
               aes(text = paste0("Total Rainfall : ",
                                 round(after_stat(y), 2), "mm"))) +
  theme_minimal() +
  labs(title = "Monthly Total Rainfall across each month from 1981 to 2023",
       subtitle = "November to February are cooler as compared to the rest of the year",
        y = "Total Rainfall (mm)",
        x = "Month",
        caption = "Data from Meteorological Service Singapore website")

ggplotly(gg, tooltip = "text") %>%
    layout(title = list(text =
                        paste0(gg$labels$title, "<br>", "<sup>",
                               gg$labels$subtitle, "</sup>"),
                        font = list(weight = "bold")))
Note

We can observe that we have more rain in April, November and December.

4.4.2 Heatmap across the months

Show the code
rain <- rainfall %>% 
        group_by(Year, Month) %>% 
        summarise(TRain = sum(TotalRainfall, na.rm = TRUE))

gg <- ggplot(rain, aes(factor(Month, levels = month.abb), factor(Year), 
                          fill = TRain)) + 
    geom_tile(color = "white",
              aes(text = paste0(Year, "-", Month,
                                "<br>Rainfall:", round(TRain, 2), "°C"))) + 
    theme_minimal() + 
    scale_fill_gradient(name = "Rainfall",
                        low = "sky blue", 
                        high = "dark blue") +
    labs(x = NULL, y = NULL, 
         title = "Total rainfall by year and month")

ggplotly(gg, tooltip = "text")
Note

We can observe that there are more rainfall in the recent years.